library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(dplyr)
library(anytime)
library(gganimate)
library(gifski)
library(ggthemes)
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(transformr)
## 
## Attaching package: 'transformr'
## 
## The following object is masked from 'package:sf':
## 
##     st_normalize
# Adjust the absolute paths below to your own machine before running.
# Read the ride and weather tables as data frames, drop rides without a price,
# and load the saved map object.
df_origin <- read.csv("/Users/guangjitang/Downloads/uber data/cab_rides.csv")
weather <- read.csv("/Users/guangjitang/Downloads/uber data/weather.csv")
df <- filter(df_origin, !is.na(price))
# Number of columns in the filtered ride table.
ncol(df)
## [1] 10
load("/Users/guangjitang/test1/map.rda")
# Get the middle spot of each polygon.
# `map` has already been loaded from map.rda above, so it is not reloaded here.
# The midpoint is the plain mean of the vertices of the first ring of each
# geometry (not an area-weighted centroid); it is only used to place a point
# marker on the map.
# NOTE(review): assumes every geometry is a simple polygon whose first element
# is a coordinate matrix (longitude in column 1, latitude in column 2).
ring_means <- vapply(
  seq_along(map$Name),
  function(i) {
    ring <- map$geometry[[i]][[1]]
    c(mean(ring[, 1]), mean(ring[, 2]))
  },
  numeric(2)
)
map$long_mid <- ring_means[1, ]
map$lat_mid <- ring_means[2, ]
# Give each map polygon the matching `source` name from df for the later join.
# Each ride source is used as a case-insensitive pattern against the polygon
# names; when several sources match one polygon, the last one in `u` wins.
u <- unique(df$source)
map$nname <- NA_character_
count_name <- 0
for (place in u) {
  hits <- grepl(place, map$Name, ignore.case = TRUE)
  map$nname[hits] <- place
  count_name <- count_name + sum(hits)
}
count_name
## [1] 11
# Only 11 names were matched successfully; add the last one manually.
# NOTE(review): row 6 is hard-coded and depends on the row order of map.rda.
map$nname[6] <- "Haymarket Square"
# Keep only the polygons that were matched to a ride location, with the
# columns needed for plotting and joining.
map2 <- select(
  filter(map, !is.na(nname)),
  nname, geometry, long_mid, lat_mid
)
# Sample for test only; capped at the number of available rows so that a small
# input does not make sample_n() fail.
sample_size <- min(10000, nrow(df))
df_sample <- df %>% sample_n(sample_size)

# Hours covered by each part of the day: night is 22-23 plus 0-5, morning 6-10,
# noon 11-13, afternoon 14-18, evening 19-21.
part_hours <- c(night = 8, morning = 5, noon = 3, afternoon = 5, evening = 3)

df_plot <- df_sample %>%
  mutate(
    # Only the weather at the source of the ride is considered.
    location = source,
    # time_stamp is divided by 1000 before conversion to a date-time.
    time = anytime(time_stamp / 1000),
    # Despite its name this is the calendar date (first 10 characters); it is
    # the coarse key used later to join against the weather table.
    time_hour = substr(time, 1, 10),
    # format() always yields the hour; substr() on the implicit character
    # conversion returns "" (hence NA) for midnight timestamps on R >= 4.3.
    hour = as.numeric(format(time, "%H")),
    # Rows with an unknown hour fall through to "night", as before.
    part_of_time = case_when(
      hour > 21 ~ "night",
      hour > 18 ~ "evening",
      hour > 13 ~ "afternoon",
      hour > 10 ~ "noon",
      hour > 5 ~ "morning",
      TRUE ~ "night"
    ),
    part_duration = unname(part_hours[part_of_time])
  )
  
weather <- weather %>%
  mutate(
    time = anytime(time_stamp), # convert time stamp to time
    time_hour = substr(time, 1, 10) # calendar date, used as the coarse join key
  )
# Pair every ride with all weather readings of the same date and location ...
df_plot <- merge(df_plot, weather, by = c("time_hour", "location"))
# ... then keep, per ride, the single reading closest in time. slice_min() on
# the absolute gap is used because top_n(1, g_time) kept the reading with the
# largest signed gap (the one furthest before the ride) and retained ties.
df_plot <- df_plot %>%
  mutate(g_time = as.numeric(difftime(time.x, time.y, units = "secs"))) %>%
  group_by(id) %>%
  slice_min(abs(g_time), n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(-g_time)
# A ride counts as rainy when the matched weather reading has a rain value.
df_plot <- df_plot %>%
  mutate(rainy = !is.na(rain))
#theme_set(theme_wsj())
# Price against distance, one panel per cab type, coloured by whether the
# matched weather reading recorded rain.
ggplot(df_plot, aes(x = distance, y = price, colour = rainy)) +
  facet_wrap("cab_type") +
  geom_point(size = 1, alpha = 0.5) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Base map: every polygon outlined in grey on white, with the polygons that
# were matched to a ride location filled by name on top.
plot2 <- ggplot() +
  geom_sf(
    data = map,
    mapping = aes(fill = "Rest"),
    colour = "grey",
    fill = "white",
    show.legend = FALSE
  ) +
  geom_sf(
    data = map2,
    mapping = aes(fill = nname),
    na.rm = TRUE,
    show.legend = TRUE
  ) +
  coord_sf(
    xlim = c(-71.14, -71.03),
    ylim = c(42.31, 42.39),
    lims_method = "cross"
  ) +
  theme_map() +
  theme(
    legend.position = c(0.95, 0),
    legend.key.size = unit(0.5, "cm"),
    legend.text = element_text(size = 10)
  )
plot2

# Rides leaving (source) and arriving at (destination) each location.
df_plot3 <- df_plot %>%
  ungroup() %>%
  count(source, name = "source_n") %>%
  rename(nname = source)
df_plot3_temp <- df_plot %>%
  ungroup() %>%
  count(destination, name = "destination_n") %>%
  rename(nname = destination)
# Join by location name rather than by row position, so the two counts stay
# aligned even when a location occurs only as a source or only as a
# destination in the sample; a missing count is treated as zero.
df_plot3 <- df_plot3 %>%
  full_join(df_plot3_temp, by = "nname") %>%
  mutate(
    source_n = coalesce(source_n, 0L),
    destination_n = coalesce(destination_n, 0L),
    difference = source_n - destination_n # out minus in
  )
map_plot3 <- merge(map2, df_plot3, by = "nname")

# Point version of the out-minus-in map. The colours follow the fill scale of
# plot4 below (cyan = more rides out than in, red = more in than out).
plot3 <- plot2 +
  geom_point(
    data = map_plot3,
    mapping = aes(x = long_mid, y = lat_mid, color = difference)
  ) +
  scale_color_gradient(low = "red", high = "cyan")

# Choropleth version: the descending `values` put cyan at the top of the range.
plot4 <- ggplot() +
  geom_sf(
    data = map, mapping = aes(fill = "Rest"), colour = "grey",
    fill = "white", show.legend = FALSE
  ) +
  geom_sf(data = map_plot3, mapping = aes(fill = difference)) +
  scale_fill_gradientn(
    values = c(1, 0.5, 0),
    colours = c("cyan", "white", "red")
  ) +
  coord_sf(
    xlim = c(-71.14, -71.03), ylim = c(42.31, 42.39),
    lims_method = "cross"
  ) +
  theme_map() +
  theme(
    legend.position = c(0.95, 0),
    legend.key.size = unit(0.5, "cm"),
    legend.text = element_text(size = 10)
  )
plot4

# Difference by part_of_time.
# Each ride is weighted by 1 / part_duration, so the totals are rides per hour
# within the part of the day and parts of different length are comparable.
df_plot3_2 <- df_plot %>%
  group_by(source, part_of_time) %>%
  summarise(source_n = sum(1 / part_duration), .groups = "drop") %>%
  rename(nname = source)
df_plot3_2_temp <- df_plot %>%
  group_by(destination, part_of_time) %>%
  summarise(destination_n = sum(1 / part_duration), .groups = "drop") %>%
  rename(nname = destination)
# Join on (location, part of day) instead of attaching by row position, which
# misaligns or fails when a combination is missing on one side.
df_plot3_2 <- df_plot3_2 %>%
  full_join(df_plot3_2_temp, by = c("nname", "part_of_time")) %>%
  mutate(
    source_n = coalesce(source_n, 0),
    destination_n = coalesce(destination_n, 0),
    difference = source_n - destination_n # out minus in
  )
map_plot3_2 <- merge(map2, df_plot3_2, by = "nname")
# The caption promises cyan for out > in, i.e. for a high difference, so cyan
# is the high end of the scale. The legend title goes on the colour scale,
# which carries `difference`; the fill legend belongs to the location names.
plot2 + facet_grid(~part_of_time) +
  geom_point(
    data = map_plot3_2,
    mapping = aes(x = long_mid, y = lat_mid, color = difference)
  ) +
  scale_color_gradient(low = "red", high = "cyan") +
  labs(
    title = "Cab out-in in Boston area by part of time in a day",
    caption = "Cyan for out>in", colour = "Out - in"
  )

# Difference by hour, plus the average price per unit of distance for rides
# leaving each location in that hour.
# NOTE(review): the counts are still weighted by 1 / part_duration although
# the grouping is already hourly; confirm that this scaling is intended.
df_plot3_3 <- df_plot %>%
  group_by(source, hour) %>%
  summarise(
    source_n = sum(1 / part_duration),
    avg_price = mean(price / distance),
    .groups = "drop"
  ) %>%
  rename(nname = source)
df_plot3_3_temp <- df_plot %>%
  group_by(destination, hour) %>%
  summarise(destination_n = sum(1 / part_duration), .groups = "drop") %>%
  rename(nname = destination)
# Join on (location, hour) instead of attaching by row position, which
# misaligns or fails when a combination is missing on one side. avg_price is
# NA for combinations that only occur as a destination.
df_plot3_3 <- df_plot3_3 %>%
  full_join(df_plot3_3_temp, by = c("nname", "hour")) %>%
  mutate(
    source_n = coalesce(source_n, 0),
    destination_n = coalesce(destination_n, 0),
    difference = source_n - destination_n # out minus in
  )
map_plot3_3 <- merge(map2, df_plot3_3, by = "nname")


# Draw one out-minus-in choropleth per hour of day, for use as the frames of
# an animation.
#
# plot_data: sf data frame with columns `nname`, `hour` and `difference`;
#            defaults to the script-level `map_plot3_3`.
# base_map:  sf data frame drawn as the white background layer; defaults to
#            the script-level `map`.
# Returns the list of plots, one per hour present in `plot_data`; each plot is
# printed as a side effect so that a graphics-capturing caller records it.
makeplot <- function(plot_data = map_plot3_3, base_map = map) {
  hourly <- split(plot_data, plot_data$hour)

  # Unsmoothed `difference` of the given hour (wrapped around midnight),
  # aligned to `places` by name; 0 where the hour or the place has no data.
  original_difference <- function(hour, places) {
    frame <- hourly[[as.character(hour %% 24)]]
    if (is.null(frame)) {
      return(rep(0, length(places)))
    }
    values <- frame$difference[match(places, frame$nname)]
    values[is.na(values)] <- 0
    values
  }

  # Add overlap: blend each hour with its two neighbours. The blend always
  # reads the untouched `hourly` list, so an hour never picks up a neighbour
  # that has already been blended.
  smoothed <- lapply(hourly, function(frame) {
    hour <- frame$hour[1]
    frame$difference <- 0.75 * original_difference(hour - 1, frame$nname) +
      frame$difference +
      0.75 * original_difference(hour + 1, frame$nname)
    frame
  })

  lapply(smoothed, function(data) {
    p <- ggplot() +
      geom_sf(
        data = base_map, mapping = aes(fill = "Rest"), colour = "grey",
        fill = "white", show.legend = FALSE
      ) +
      geom_sf(data = data, mapping = aes(fill = difference)) +
      scale_fill_gradientn(
        values = c(1, 0.5, 0),
        colours = c("cyan", "white", "red")
      ) +
      coord_sf(
        xlim = c(-71.14, -71.03), ylim = c(42.31, 42.39),
        lims_method = "cross"
      ) +
      theme_map() +
      theme(
        legend.position = c(0.95, 0),
        legend.key.size = unit(0.5, "cm"),
        legend.text = element_text(size = 10)
      ) +
      labs(
        title = "Cab out-in in Boston area",
        caption = "In and out", fill = "Out - in"
      ) +
      labs(subtitle = paste("Hour: ", data$hour[1]))
    print(p)
  })
}
# Record the frames printed by makeplot() into a GIF (1280 x 720, half a
# second per frame) and open the result.
gif_file <- "gif_1.gif"
save_gif(
  makeplot(),
  gif_file = gif_file,
  width = 1280,
  height = 720,
  delay = 0.5
)
## [1] "/Users/guangjitang/test1/gif_1.gif"
utils::browseURL(gif_file)
df_plot5 <- df_plot
# Placeholder for per-hour plots; nothing is stored in it yet.
plotlist <- list()
# Rides per hour of day: element k holds the count for hour k - 1. Computed
# once with tabulate() instead of rebuilding the same histogram row by row on
# each of 24 outer iterations.
count_5 <- tabulate(df_plot5$hour + 1, nbins = 24)
# Price and distance by hour: animated scatter with one frame per value of
# `hour`; the subtitle shows the frame's hour.
# NOTE(review): title and caption look copied from the out-in maps, and `fill`
# is labelled although this plot maps colour; confirm the intended labels.
ggplot(df_plot5, aes(x = distance, y = price, colour = cab_type)) +
  geom_point(size = 1, alpha = 0.5) +
  geom_smooth() +
  coord_cartesian(ylim = c(0, 50)) +
  labs(
    title = "Cab out-in in Boston area",
    caption = "In and out",
    fill = "Out - in",
    subtitle = "P: {frame_time}"
  ) +
  transition_time(hour)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Price/distance by hour.
# The y range is taken from the whisker ends of the overall price/distance
# distribution so that outliers do not flatten the boxes.
ylim1 <- boxplot.stats(df_plot5$price / df_plot5$distance)$stats[c(1, 5)]
# No "{frame_time}" subtitle here: this plot has no gganimate transition, so
# the placeholder would be printed literally.
# NOTE(review): title and caption look copied from the out-in maps, and boxes
# are grouped by source rather than by hour; confirm the intended design.
ggplot(
  df_plot5,
  aes(hour, price / distance, color = cab_type, group = source)
) +
  geom_boxplot(size = 1, alpha = 0.5) +
  coord_cartesian(ylim = ylim1 * 1.5) +
  labs(
    title = "Cab out-in in Boston area",
    caption = "In and out", fill = "Out - in"
  )

# Rides per hour leaving each location in each part of the day, drawn as one
# filled map per part of the day.
df_plot6 <- rename(
  summarise(
    group_by(df_plot5, source, part_of_time),
    n = sum(1 / part_duration)
  ),
  nname = source
)
## `summarise()` has grouped output by 'source'. You can override using the
## `.groups` argument.
map_plot6 <- left_join(map2, df_plot6, by = "nname")
ggplot(data = map_plot6, mapping = aes(fill = n)) +
  geom_sf() +
  facet_wrap(~part_of_time)

# Hours and price by place: points and a thin trend line per location, plus a
# thick overall trend with its confidence band.
ggplot(map_plot3_3, mapping = aes(x = hour, y = avg_price)) +
  geom_point(aes(colour = nname)) +
  geom_smooth(
    mapping = aes(x = hour, y = avg_price, colour = nname),
    data = map_plot3_3,
    se = FALSE,
    size = 1
  ) +
  geom_smooth(
    mapping = aes(x = hour, y = avg_price),
    data = map_plot3_3,
    se = TRUE,
    size = 2
  ) +
  labs(title = "Cab hour and price in Boston area")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Hours and in_out by place: out-minus-in difference per hour, with one trend
# line per location. The title names the plotted quantity (out-in), not price.
ggplot(map_plot3_3, mapping = aes(hour, difference)) +
  geom_point(aes(color = nname)) +
  geom_smooth(
    mapping = aes(color = nname), data = map_plot3_3,
    se = FALSE, size = 1
  ) +
  labs(title = "Cab hour and out-in in Boston area")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# In_out and price: average price per distance against the out-minus-in
# difference, all locations pooled, with a single trend line.
ggplot(map_plot3_3, mapping = aes(x = difference, y = avg_price)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Cab out-in and price in Boston area")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# In_out and price by place: same axes as the pooled plot, but grouped by
# location so that each location gets its own trend line.
ggplot(
  map_plot3_3,
  mapping = aes(x = difference, y = avg_price, group = nname)
) +
  geom_point() +
  geom_smooth() +
  labs(title = "Cab out-in and price in Boston area")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'